import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Load the NHANES 2015-2016 survey extract into a DataFrame.
da = pd.read_csv(
    'B:\\Python Learnings\\Codes\\NHANES_dataset\\nhanes_2015_2016.csv'
)
da.head(2)

# Scatter plot with a fitted regression line for BMXARML against BMXLEG
# (presumably arm length vs. leg length -- confirm against the codebook).
# Semi-transparent markers keep dense regions readable.
sns.regplot(
    x='BMXARML',
    y='BMXLEG',
    data=da,
    dropna=True,
    scatter_kws={"alpha": 0.4},
    marker='+',
    color='red',
)

# The same pair of variables shown as a joint kernel-density estimate.
sns.jointplot(
    x='BMXARML',
    y='BMXLEG',
    data=da,
    kind='kde',
    dropna=True,
)

# Relationship between blood-pressure readings.
sns.set_palette("Paired")
sns.jointplot(
    x='BPXSY1',
    y='BPXDI1',
    data=da,
    dropna=True,
    kind='kde',
)
sns.jointplot(
    x='BPXSY1',
    y='BPXSY2',
    data=da,
    dropna=True,
    kind='kde',
)
# A high correlation is visible in the last plot, which is expected: the two
# readings are blood pressure of the same person taken a few minutes apart.
# Human-readable gender labels. The column has to be spelled "RIAGENDRx"
# (lower-case x): every later lookup in this script uses that exact name,
# and pandas column names are case-sensitive.
da['RIAGENDRx'] = da.RIAGENDR.replace({1: 'Male', 2: 'Female'})

# help(sns.FacetGrid)
# BMXLEG vs. BMXARML scatter, one facet row per gender.
sns.FacetGrid(da, row="RIAGENDRx").map(
    plt.scatter, "BMXLEG", "BMXARML", alpha=0.4, marker='+', color='green'
).add_legend()

# Correlation between the two measurements, computed separately per gender
# on rows where both values are present.
print(da.loc[da.RIAGENDRx == "Male", ["BMXLEG", "BMXARML"]].dropna().corr())
print(da.loc[da.RIAGENDRx == "Female", ["BMXLEG", "BMXARML"]].dropna().corr())

# TODO: check the use of pair plot
# Same scatter, faceted by RIDRETH1 (columns) and gender (rows).
sns.FacetGrid(da, col='RIDRETH1', row='RIAGENDRx').map(
    plt.scatter, "BMXLEG", "BMXARML", alpha=0.6, marker='+', color='red'
).add_legend()
# Categorical bivariate data
# Replace the numeric education codes with readable labels.
da["DMDEDUC2x"] = da.DMDEDUC2.replace({
    1: "<9",
    2: "9-11",
    3: "HS/GED",
    4: "Some college/AA",
    5: "College",
    7: "Refused",
    9: "Don't know",
})
# Replace the numeric marital-status codes with readable labels.
da["DMDMARTLx"] = da.DMDMARTL.replace({
    1: "Married",
    2: "Widowed",
    3: "Divorced",
    4: "Separated",
    5: "Never married",
    6: "Living w/partner",
    77: "Refused",
})

# Drop respondents whose education is "Don't know" or whose marital status is
# "Refused". String comparison is case-sensitive, so the literals here must
# match the labels assigned above exactly; otherwise nothing is filtered out.
db = da.loc[(da.DMDEDUC2x != "Don't know") & (da.DMDMARTLx != "Refused"), :]
da.head(3)
db.head(2)
# Plan: create a contingency table of education status against marital status.
# help(pd.crosstab)
# x=pd.crosstab(db.DMDEDUC2x,da.DMDMARTLx,da.RIAGENDRx)
# Both variables are taken from the filtered frame `db` so the table is
# built from a single, consistent set of respondents.
x = pd.crosstab(db.DMDEDUC2x, db.DMDMARTLx)
x
# Normalise within each row (education level): marital-status proportions.
x.apply(lambda z: z / z.sum(), axis=1)
# Normalise within each column (marital status): education proportions.
x.apply(lambda z: z / z.sum(), axis=0)
# db.groupby(["RIAGENDRx","DMDEDUC2x","DMDMARTLx","DMDCITZN"]).size().unstack().fillna(0).apply(lambda x: x/x.sum(),axis=1)
# Marital-status proportions within each (gender, education) cell. Missing
# combinations are counted as zero before normalising each row.
db.groupby(["RIAGENDRx", "DMDEDUC2x", "DMDMARTLx"]).size().unstack().fillna(0).apply(
    lambda row: row / row.sum(), axis=1
)
# Marital-status proportions per (gender, education) cell, restricted to two
# age bands, so the share of "Married" respondents can be compared.

# Age band 40 <= RIDAGEYR < 50.
dx = db.loc[(db.RIDAGEYR >= 40) & (db.RIDAGEYR < 50)]
a = (
    dx.groupby(["RIAGENDRx", "DMDEDUC2x", "DMDMARTLx"])
    .size()
    .unstack()
    .fillna(0)
    .apply(lambda row: row / row.sum(), axis=1)
)

# Age band 50 <= RIDAGEYR < 60.
dx = db.loc[(db.RIDAGEYR >= 50) & (db.RIDAGEYR < 60)]
b = (
    dx.groupby(["RIAGENDRx", "DMDEDUC2x", "DMDMARTLx"])
    .size()
    .unstack()
    .fillna(0)
    .apply(lambda row: row / row.sum(), axis=1)
)

# Show only the "Married" proportion, pivoted so education levels become
# columns, for each age band in turn.
print(a.loc[:, ["Married"]].unstack())
print("")
print(b.loc[:, ["Married"]].unstack())
# Age distribution per marital status. x/y are passed by keyword: recent
# seaborn releases no longer accept them positionally.
plt.figure(figsize=(12, 4))
a = sns.boxplot(x=db.DMDMARTLx, y=db.RIDAGEYR)

# Same comparison as a violin plot, drawn from the unfiltered frame.
plt.figure(figsize=(12, 4))
a = sns.violinplot(x=da.DMDMARTLx, y=da.RIDAGEYR)

# Pairwise plots over the columns of each frame.
# NOTE(review): this builds one panel per pair of columns and may be slow on
# a wide frame -- consider passing `vars=[...]` to limit the columns.
sns.pairplot(da)
sns.pairplot(db)